This script compares GSVA scores for hallmark and KEGG pathways between PBTA samples with and without DNA repair gene germline PLP variants

Load packages and set directories

library(tidyverse)
library(tibble)
library(ComplexHeatmap)
library(circlize)

Set filepaths

root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "CLK1-splicing-impact-morpholino")
results_dir <- file.path(analysis_dir, "results")
plots_dir <- file.path(analysis_dir, "plots")
source(file.path(root_dir, "figures", "theme_for_plots.R"))

Set file paths

gsva_file <- file.path(results_dir, "expr_collapsed_clk1_ctrl_morpho_hallmark_gsva_scores.tsv")
de_gsva_file <- file.path(results_dir, "expr_collapsed_de_clk1_ctrl_morpho_hallmark_gsva_scores.tsv")
kegg_file <- file.path(results_dir, "expr_collapsed_clk1_ctrl_morpho_kegg_gsva_scores.tsv")
de_kegg_file <- file.path(results_dir, "expr_collapsed_de_clk1_ctrl_morpho_kegg_gsva_scores.tsv")
dna_file <- file.path(results_dir, "expr_collapsed_clk1_ctrl_morpho_dna_repair_gsva_scores.tsv")
de_dna_file <- file.path(results_dir, "expr_collapsed_de_clk1_ctrl_morpho_dna_repair_gsva_scores.tsv")

hall_splice_file <- file.path(results_dir, "expr_splice_clk1_ctrl_morpho_hallmark_gsva_scores.tsv")
hall_splice_onco_file <- file.path(results_dir, "expr_splice_onco_clk1_ctrl_morpho_hallmark_gsva_scores.tsv")
kegg_splice_file <- file.path(results_dir, "expr_splice_clk1_ctrl_morpho_kegg_gsva_scores.tsv")
kegg_splice_onco_file <- file.path(results_dir, "expr_splice_onco_clk1_ctrl_morpho_kegg_gsva_scores.tsv")
dna_splice_file <- file.path(results_dir, "expr_splice_clk1_ctrl_morpho_dna_repair_gsva_scores.tsv")
dna_splice_onco_file <- file.path(results_dir, "expr_splice_onco_clk1_ctrl_morpho_dna_repair_gsva_scores.tsv")

# reduce kegg pathways
hallmark_file <- file.path(root_dir, "analyses", "clustering_analysis", "input", "genesets.tsv")

Create histology file for grouping samples

hall_to_keep <- read_tsv(hallmark_file) %>%
  filter(Keep == "Yes")
## Rows: 51 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Geneset, Keep
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_ids <- c("CTRL1", "CTRL2", "CTRL3", "Treated1", "Treated2", "Treated3")
treatments <- c("Non-targeting Morpholino", 
                "Non-targeting Morpholino", 
                "Non-targeting Morpholino", 
                "CLK1 Exon 4 Morpholino", 
                "CLK1 Exon 4 Morpholino",
                "CLK1 Exon 4 Morpholino")
df <- tibble(sample_id = sample_ids, Treatment = treatments)

# read in scores and de results
hallmark_scores <- read_tsv(gsva_file) %>%
  filter(geneset %in% hall_to_keep$Geneset)
## Rows: 300 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hallmark_pathways <- unique(hallmark_scores$geneset)

kegg_scores <- read_tsv(kegg_file) 
## Rows: 1116 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kegg_pathways <- unique(kegg_scores$geneset)

dna_scores <- read_tsv(dna_file)
## Rows: 36 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dna_pathways <- unique(dna_scores$geneset)

hallmark_scores_de <- read_tsv(de_gsva_file) %>%
  filter(geneset %in% hall_to_keep$Geneset)
## Rows: 300 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hallmark_pathways_de <- unique(hallmark_scores_de$geneset)

kegg_scores_de <- read_tsv(de_kegg_file)
## Rows: 1116 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kegg_pathways_de <- unique(kegg_scores_de$geneset)

dna_scores_de <- read_tsv(de_dna_file)
## Rows: 36 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dna_pathways_de <- unique(dna_scores_de$geneset)

# splicing
hallmark_scores_sp <- read_tsv(hall_splice_file) %>%
  filter(geneset %in% hall_to_keep$Geneset)
## Rows: 282 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hallmark_pathways_sp <- unique(hallmark_scores_sp$geneset)

kegg_scores_sp <- read_tsv(kegg_splice_file)
## Rows: 660 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kegg_pathways_sp <- unique(kegg_scores_sp$geneset)

dna_scores_sp <- read_tsv(dna_splice_file)
## Rows: 30 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dna_pathways_sp <- unique(dna_scores_sp$geneset)

hallmark_scores_sp_onc <- read_tsv(hall_splice_onco_file) %>%
  filter(geneset %in% hall_to_keep$Geneset)
## Rows: 276 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hallmark_pathways_sp_onc <- unique(hallmark_scores_sp_onc$geneset)

kegg_scores_sp_onc <- read_tsv(kegg_splice_onco_file)
## Rows: 648 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kegg_pathways_sp_onc <- unique(kegg_scores_sp_onc$geneset)

dna_scores_sp_onc <- read_tsv(dna_splice_onco_file)
## Rows: 30 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dna_pathways_sp_onc <- unique(dna_scores_sp_onc$geneset)

Run GSVA score comparisons between samples with and without CLK1 morpholino

# Define repair gene source input directories 
treat_ids <- df %>%
  filter(Treatment == "CLK1 Exon 4 Morpholino") %>%
  pull(sample_id) %>%
  unique()

ctrl_ids <- df %>%
  filter(Treatment == "Non-targeting Morpholino") %>%
  pull(sample_id) %>%
  unique()

define data frame to store GSVA scores, differences between groups, and corresponding test statistic p-values

Assuming you have defined kegg_pathways, hallmark_pathways, dna_pathways lists

And kegg_scores, hallmark_scores, dna_scores data frames with columns ‘geneset’, ‘sample_id’, and ‘gsva_score’

treat_ids and ctrl_ids are vectors containing IDs for treated and control samples

# Lists of score data frames and pathway lists
score_sources <- list(kegg_scores, hallmark_scores, dna_scores, kegg_scores_de, hallmark_scores_de, dna_scores_de,
                      kegg_scores_sp, hallmark_scores_sp, dna_scores_sp, kegg_scores_sp_onc, hallmark_scores_sp_onc, dna_scores_sp_onc)
pathway_sources <- list(kegg_pathways, hallmark_pathways, dna_pathways, kegg_pathways_de, hallmark_pathways_de, dna_pathways_de,
                        kegg_pathways_sp, hallmark_pathways_sp, dna_pathways_sp, kegg_pathways_sp_onc, hallmark_pathways_sp_onc, dna_pathways_sp_onc)
score_names <- c("kegg", "hallmark", "dna_repair", "kegg_de", "hallmark_de", "dna_repair_de",
                 "kegg_sp", "hallmark_sp", "dna_repair_sp", "kegg_sp_onc", "hallmark_sp_onc", "dna_repair_sp_onc") # Names for output files

for (i in 1:length(score_sources)) {
  score_df <- score_sources[[i]]
  pathway_list <- pathway_sources[[i]]
  score_name <- score_names[i]
  
  gsva_res <- data.frame(pathway = pathway_list,
                         treat_score = rep(0, length(pathway_list)),
                         ctrl_score = rep(0, length(pathway_list)),
                         score_diff = rep(0, length(pathway_list)),
                         wilcox_stat = rep(0, length(pathway_list)),
                         pvalue = rep(0, length(pathway_list)))
  
 for (k in 1:length(pathway_list)) {
    treat_scores <- score_df[score_df$geneset == pathway_list[k] & score_df$sample_id %in% treat_ids, ]$gsva_score
    ctrl_scores <- score_df[score_df$geneset == pathway_list[k] & score_df$sample_id %in% ctrl_ids, ]$gsva_score
    
    # Ensure treat_scores and ctrl_scores are numeric vectors
    treat_scores <- as.numeric(treat_scores)
    ctrl_scores <- as.numeric(ctrl_scores)
    
    # Proceed with mean calculation and Wilcoxon test as before
    gsva_res$treat_score[k] <- mean(treat_scores, na.rm = TRUE)
    gsva_res$ctrl_score[k] <- mean(ctrl_scores, na.rm = TRUE)
    gsva_res$score_diff[k] <- gsva_res$treat_score[k] - gsva_res$ctrl_score[k]

    if (length(treat_scores) > 0 && length(ctrl_scores) > 0) {
      test_result <- tryCatch({
        wilcox.test(treat_scores, ctrl_scores, alternative = 'two.sided')
      }, warning = function(w) {
        return(list(statistic = NA, p.value = NA))
      }, error = function(e) {
        return(list(statistic = NA, p.value = NA))
      })
      
      gsva_res$wilcox_stat[k] <- test_result$statistic
      gsva_res$pvalue[k] <- test_result$p.value
    } else {
      gsva_res$wilcox_stat[k] <- NA
      gsva_res$pvalue[k] <- NA
    }
}

  
  # Write to output
  output_filename <- paste0("gsva_score_diff_", score_name, ".tsv")
  write_tsv(gsva_res, file.path(results_dir, output_filename))
}

Plot single samples

# create anno colors
top_annot <- df %>%
  dplyr::select(Treatment) %>%
  as.data.frame()
anno_col <- list(Treatment = c("Non-targeting Morpholino" = "#FFC20A", "CLK1 Exon 4 Morpholino" = "#0C7BDC"))

# Heatmap annotation
top_anno = HeatmapAnnotation(df = top_annot,
                         col = anno_col, show_legend = TRUE)
 
# plot
for (i in 1:length(score_sources)) {
  score_df <- score_sources[[i]] %>%
    mutate(geneset = gsub("HALLMARK_|KEGG_", "", geneset),
           geneset = gsub("_", " ", geneset))

  score_df_summarized <- score_df %>%
    mutate(Treatment = ifelse(grepl("CTRL", sample_id), "Non-targeting Morpholino", "CLK1 Exon 4 Morpholino")) %>%
    group_by(geneset, Treatment) %>%
    summarise(Mean_GSVA_Score = mean(gsva_score, na.rm = TRUE), .groups = 'drop')

# Re-order
ordering_metric <- score_df_summarized %>%
  filter(Treatment == "CLK1 Exon 4 Morpholino") %>%
  arrange(desc(Mean_GSVA_Score)) %>%
  pull(geneset)

# Apply this ordering to the 'geneset' factor
score_df_summarized$geneset <- factor(score_df_summarized$geneset, levels = ordering_metric)

  gsva_barplot <- ggplot(score_df_summarized, aes(x = geneset, y = Mean_GSVA_Score, fill = Treatment)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme_Publication() +
  labs(x = "Geneset", y = "Mean GSVA score") +
  scale_fill_manual(values = c("Non-targeting Morpholino" = "#FFC20A", "CLK1 Exon 4 Morpholino" = "#0C7BDC")) +
  coord_flip() +
    theme(legend.position = "top") # Positions the legend at the top

    
  score_name <- score_names[i]
  gsva_scores_mat <- score_df %>%
  spread(., sample_id, gsva_score) %>%
  tibble::column_to_rownames("geneset") %>%
  as.matrix()
  
  if (i %in% c(1,4,7,10)){
    height <- 30
    width <- 12
    cell_width <- 3
  }
  else if (i %in% c(2,5,8,11)){
    height <- 10
    width <- 8
    cell_width <- 3
  }
  else if (i %in% c(3,6,9,12)){
    height <- 3.5
    width <- 5.5
    cell_width <- 3
  }
  
heatmap_obj <- Heatmap(
  gsva_scores_mat,
  col = colorRamp2(c(-.8, 0, .8), c("darkblue", "white", "red")),
  name = "GSVA Scores",
  na_col = "whitesmoke",
  width = unit(cell_width, "cm"),
  show_column_names = FALSE,
  cluster_columns = FALSE,
  row_names_gp = grid::gpar(fontsize = 12),
  top_annotation = top_anno,
  heatmap_legend_param = list(direction = "horizontal"),
)

gsva_heatmap <- draw(heatmap_obj, annotation_legend_side = "top", heatmap_legend_side = "top")


output_filename <- paste0("gsva_heatmap_", score_name, ".pdf")
pdf(file.path(plots_dir, output_filename),  height = height, width = width)
print(gsva_heatmap)
print(gsva_barplot)
dev.off()
}

Print session info

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggthemes_5.1.0        circlize_0.4.16       ComplexHeatmap_2.20.0
##  [4] lubridate_1.9.3       forcats_1.0.0         stringr_1.5.1        
##  [7] dplyr_1.1.4           purrr_1.0.2           readr_2.1.5          
## [10] tidyr_1.3.1           tibble_3.2.1          ggplot2_3.5.1        
## [13] tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.5        shape_1.4.6.1       rjson_0.2.21       
##  [4] xfun_0.44           bslib_0.7.0         GlobalOptions_0.1.2
##  [7] tzdb_0.4.0          vctrs_0.6.5         tools_4.4.0        
## [10] generics_0.1.3      stats4_4.4.0        parallel_4.4.0     
## [13] fansi_1.0.6         highr_0.11          cluster_2.1.6      
## [16] pkgconfig_2.0.3     RColorBrewer_1.1-3  S4Vectors_0.42.1   
## [19] lifecycle_1.0.4     farver_2.1.2        compiler_4.4.0     
## [22] munsell_0.5.1       codetools_0.2-20    clue_0.3-65        
## [25] htmltools_0.5.8.1   sass_0.4.9          yaml_2.3.8         
## [28] pillar_1.9.0        crayon_1.5.2        jquerylib_0.1.4    
## [31] cachem_1.1.0        magick_2.8.3        iterators_1.0.14   
## [34] foreach_1.5.2       tidyselect_1.2.1    digest_0.6.35      
## [37] stringi_1.8.4       labeling_0.4.3      rprojroot_2.0.4    
## [40] fastmap_1.2.0       colorspace_2.1-0    cli_3.6.2          
## [43] magrittr_2.0.3      utf8_1.2.4          withr_3.0.0        
## [46] scales_1.3.0        bit64_4.0.5         timechange_0.3.0   
## [49] rmarkdown_2.27      matrixStats_1.3.0   bit_4.0.5          
## [52] png_0.1-8           GetoptLong_1.0.5    hms_1.1.3          
## [55] evaluate_0.24.0     knitr_1.47          IRanges_2.38.1     
## [58] doParallel_1.0.17   rlang_1.1.4         Rcpp_1.0.12        
## [61] glue_1.7.0          BiocGenerics_0.50.0 vroom_1.6.5        
## [64] jsonlite_1.8.8      R6_2.5.1